Read data

df <- read_excel("wuhan_blood_sample_data_Jan_Feb_2020.xlsx")

Clean dataset

patients_df <- df %>% group_by(`Admission time`, `Discharge time`, gender, age, outcome) %>%
summarise(PATIENT_ID = sum(PATIENT_ID, na.rm = TRUE), `Total blood tests` = n()) 

patients_df <- patients_df %>%
    mutate(`Days in hospital` = ceiling(difftime(`Discharge time`, `Admission time`, units = "days")))


df <- full_join(patients_df %>% ungroup() %>% select(`Admission time`, PATIENT_ID), df %>% select(-PATIENT_ID), by="Admission time")

df$gender<-ifelse(df$gender==1, 'Male', 'Female') 
df <- df %>% mutate(gender = as.factor(gender))

patients_df$gender<-ifelse(patients_df$gender==1, 'Male', 'Female') 
patients_df <- patients_df %>% mutate(gender = as.factor(gender))

df$outcome<-ifelse(df$outcome==1, 'Death', 'Survival')
df <- df %>% mutate(outcome = as.factor(outcome))

patients_df$outcome<-ifelse(patients_df$outcome==1, 'Death', 'Survival')
patients_df <- patients_df %>% mutate(outcome = as.factor(outcome))

Patients summary

patients_summary <- patients_df %>% ungroup() %>% select(-c(age, PATIENT_ID))

tbl_summary(
    patients_summary,
    by = outcome,
    label = gender ~ "Gender",
) %>%
    add_n() %>%
    modify_header(label = "") %>%
    add_overall() %>%
    bold_labels() 
Overall, N = 3751 N Survival, N = 2011 Death, N = 1741
Gender 375
Male 224 (60%) 98 (49%) 126 (72%)
Female 151 (40%) 103 (51%) 48 (28%)
Total blood tests 16 (9, 21) 375 16 (12, 20) 14 (7, 24)
Days in hospital 10 (5, 16) 375 14 (10, 18) 6 (3, 10)

1 Statistics presented: n (%); Median (IQR)

 ggplot(patients_df, aes(x=age,fill=gender)) + geom_histogram(binwidth = 1) + facet_grid(. ~ gender) + scale_x_continuous(name="Age", limits=c(min(df$age), max(df$age)), breaks = seq(0, 100, by=10)) + scale_y_continuous(name = "Number of patients", limits = c(0,10), breaks = seq(0,10, by=1)) +
    theme_minimal()

Patients grouped by outcome, age and gender

 ggplot(patients_df, aes(x=age,fill=outcome)) + geom_histogram(binwidth = 1) + facet_grid(outcome ~ gender) + scale_x_continuous(name="Age", limits=c(min(df$age), max(df$age)), breaks = seq(0, 100, by=10)) + scale_y_continuous(name = "Number of patients", limits = c(0,9), breaks = seq(0,9, by=1)) +
    theme_minimal()

Patients grouped by outcome and hospitalization duration

ggplot(patients_df, aes(x=`Days in hospital`, fill=outcome)) + geom_histogram(binwidth = 1) + facet_grid(. ~ outcome)  + ylab("Number of patients") +
    theme_minimal()

Patients grouped by outcome and total blood tests taken

ggplot(patients_df, aes(x=`Total blood tests`, fill=outcome)) + geom_histogram(binwidth = 1) + facet_grid(. ~ outcome)  + ylab("Number of patients") +
  scale_x_continuous(name="Total blood tests", limits=c(0, 60), breaks = seq(0, 60, by=10)) +
   scale_y_continuous(name = "Number of patients", limits = c(0,20), breaks = seq(0,20, by=2)) +
    theme_minimal()

Biomarkers

no_dates_df <- df %>% select(-c('Admission time', 'Discharge time', 'RE_DATE', 'PATIENT_ID', 'age', 'gender' ))
tbl_summary(
    no_dates_df,
    by = outcome,
    missing = "no"
            ) %>%
    add_n() %>%
    modify_header(label = "**Biomarker**") %>%
  modify_spanning_header(c("stat_1", "stat_2") ~ "**Final patient outcome related to the test**") %>%
    bold_labels() 
Biomarker N Final patient outcome related to the test
Death, N = 2,9051 Survival, N = 3,2151
Hypersensitive cardiac troponinI 507 70 (18, 631) 3 (2, 7)
hemoglobin 975 123 (110, 135) 127 (116, 138)
Serum chloride 975 104 (100, 111) 101 (99, 103)
Prothrombin time 662 16.3 (15.0, 18.2) 13.6 (13.1, 14.1)
procalcitonin 459 0.38 (0.14, 1.13) 0.04 (0.02, 0.06)
eosinophils(%) 957 0.00 (0.00, 0.10) 0.70 (0.00, 1.80)
Interleukin 2 receptor 268 1,180 (807, 1,603) 529 (400, 742)
Alkaline phosphatase 930 83 (64, 123) 60 (50, 75)
albumin 934 28 (24, 31) 36 (34, 39)
basophil(%) 957 0.10 (0.10, 0.20) 0.20 (0.10, 0.40)
Interleukin 10 267 11 (6, 17) 5 (5, 8)
Total bilirubin 930 14 (10, 25) 8 (6, 12)
Platelet count 957 112 (55, 174) 229 (176, 290)
monocytes(%) 958 3.0 (2.0, 4.7) 8.2 (6.3, 10.0)
antithrombin 330 80 (70, 92) 93 (86, 103)
Interleukin 8 268 30 (18, 61) 11 (7, 19)
indirect bilirubin 906 6.2 (4.2, 9.2) 4.9 (3.4, 7.1)
Red blood cell distribution width 923 13.20 (12.40, 14.40) 12.20 (11.80, 12.80)
neutrophils(%) 957 92 (88, 95) 66 (56, 76)
total protein 931 62 (57, 68) 68 (65, 72)
Quantification of Treponema pallidum antibodies 279 0.06 (0.04, 0.07) 0.05 (0.04, 0.07)
Prothrombin activity 659 66 (56, 78) 94 (88, 103)
HBsAg 279 0.01 (0.00, 0.02) 0.00 (0.00, 0.01)
mean corpuscular volume 957 91.3 (87.1, 96.4) 89.8 (86.8, 91.9)
hematocrit 957 35.9 (32.5, 39.8) 37.1 (34.3, 39.9)
White blood cell count 1,127 12 (8, 17) 6 (4, 8)
Tumor necrosis factorα 268 11 (8, 17) 8 (6, 10)
mean corpuscular hemoglobin concentration 957 342 (331, 350) 343 (335, 350)
fibrinogen 566 3.92 (2.44, 5.63) 4.40 (3.56, 5.34)
Interleukin 1β 268 5.0 (5.0, 5.0) 5.0 (5.0, 5.0)
Urea 936 11 (7, 17) 4 (3, 5)
lymphocyte count 957 0.46 (0.31, 0.69) 1.25 (0.87, 1.62)
PH value 384 6.50 (6.00, 7.41) 6.50 (6.00, 7.00)
Red blood cell count 1,127 4.0 (3.6, 4.6) 4.2 (3.8, 4.7)
Eosinophil count 957 0.00 (0.00, 0.01) 0.03 (0.00, 0.09)
Corrected calcium 914 2.35 (2.27, 2.44) 2.37 (2.27, 2.44)
Serum potassium 980 4.60 (4.04, 5.27) 4.28 (3.92, 4.62)
glucose 775 9.1 (6.9, 13.3) 5.7 (5.0, 7.6)
neutrophils count 957 10.8 (7.0, 15.2) 3.5 (2.4, 5.2)
Direct bilirubin 930 8 (5, 14) 4 (2, 5)
Mean platelet volume 862 11.30 (10.70, 12.20) 10.40 (9.90, 11.00)
ferritin 283 1,636 (928, 2,517) 504 (235, 834)
RBC distribution width SD 923 43.7 (39.9, 48.5) 39.5 (37.6, 41.4)
Thrombin time 566 17.30 (15.80, 19.75) 16.40 (15.60, 17.30)
(%)lymphocyte 958 4 (2, 7) 24 (16, 33)
HCV antibody quantification 279 0.07 (0.04, 0.11) 0.06 (0.04, 0.08)
D-D dimer 630 19 (3, 21) 1 (0, 1)
Total cholesterol 931 3.32 (2.72, 3.88) 3.93 (3.39, 4.48)
aspartate aminotransferase 935 38 (25, 59) 21 (17, 29)
Uric acid 934 245 (166, 374) 240 (193, 304)
HCO3- 934 21.8 (18.8, 24.7) 24.7 (22.8, 26.7)
calcium 979 2.00 (1.90, 2.08) 2.17 (2.10, 2.25)
Amino-terminal brain natriuretic peptide precursor(NT-proBNP) 475 1,467 (516, 4,578) 64 (23, 166)
Lactate dehydrogenase 934 593 (431, 840) 220 (189, 278)
platelet large cell ratio 862 35 (30, 42) 28 (23, 33)
Interleukin 6 272 66 (30, 142) 8 (2, 21)
Fibrin degradation products 330 114 (18, 150) 4 (4, 4)
monocytes count 957 0.36 (0.20, 0.58) 0.43 (0.32, 0.58)
PLT distribution width 862 13.60 (12.10, 15.93) 11.70 (10.70, 13.00)
globulin 930 34.1 (30.2, 38.2) 31.8 (29.5, 35.2)
γ-glutamyl transpeptidase 930 42 (27, 79) 29 (19, 46)
International standard ratio 659 1.31 (1.17, 1.48) 1.04 (0.99, 1.09)
basophil count(#) 957 0.010 (0.010, 0.030) 0.010 (0.010, 0.020)
2019-nCoV nucleic acid detection 501
-1 57 (100%) 444 (100%)
mean corpuscular hemoglobin 957 31.20 (29.90, 32.70) 30.70 (29.60, 31.90)
Activation of partial thromboplastin time 568 40 (36, 45) 39 (35, 43)
High sensitivity C-reactive protein 737 114 (65, 191) 7 (2, 35)
HIV antibody quantification 278 0.08 (0.07, 0.11) 0.09 (0.08, 0.11)
serum sodium 975 142 (138, 148) 140 (138, 141)
thrombocytocrit 862 0.15 (0.10, 0.21) 0.24 (0.19, 0.30)
ESR 383 36 (16, 59) 26 (13, 40)
glutamic-pyruvic transaminase 931 26 (18, 44) 21 (15, 36)
eGFR 936 72 (43, 91) 100 (85, 114)
creatinine 936 88 (68, 130) 64 (54, 83)

1 Statistics presented: Median (IQR); n (%)

Finding correlation

Finding correlation between outcome and other variables. Ignoring Admission time, PATIENT_ID, RE_DATE and Discharge time and dropping outcome which correlation with itself is equal to 1.0.

cor_df <-df

cor_df$outcome<-ifelse(cor_df$outcome=='Death', 1, 0)
cor_df$gender<-ifelse(cor_df$gender=='Male', 1, 0)

cor_df <- cor_df %>% rename(isMale=gender)
cor_df <- cor_df %>% rename(Death=outcome)

cor_df <- cor_df %>% select (-c(`Admission time`, PATIENT_ID, RE_DATE, `Discharge time` ))

corrMatrix <- cor(cor_df[sapply(cor_df, is.numeric)], use='pairwise.complete.obs')

correlation_df <- as.data.frame(corrMatrix)

correlation_df %>% rownames_to_column('variable') %>% filter(variable != 'Death') %>% select(variable, Death) %>% mutate(Death = abs(Death)) %>%
  arrange(desc(Death)) %>%
  rename(`Outcome correlation` = Death) %>%
  head(10) 
variable Outcome correlation
neutrophils(%) 0.7258383
(%)lymphocyte 0.7230203
albumin 0.6892432
Prothrombin activity 0.6562405
High sensitivity C-reactive protein 0.6524696
D-D dimer 0.6368244
Lactate dehydrogenase 0.6230357
neutrophils count 0.6071749
Fibrin degradation products 0.6044110
age 0.5860651

Use of correlation

In following section, we will use 3 biomarkers that are most correlated to outcome and visualize theirs mean values among each patient at 3D graph. So we take only these patients who had all of these 3 biomarkers tested at least once. If somebody was tested more than once, then mean value of all tests for each biomarker is calculated.

new_df  <- df

new_df <- new_df %>% select(PATIENT_ID, `neutrophils(%)`, `(%)lymphocyte`, albumin, outcome) %>%
    group_by(PATIENT_ID, outcome) %>%
    summarise(Neutrophils_mean = mean(`neutrophils(%)`, na.rm = TRUE), Lymphocyte_mean = mean(`(%)lymphocyte`, na.rm = TRUE), albumin_mean = mean(`albumin`, na.rm = TRUE))

new_df <- new_df %>% filter(!is.nan(Neutrophils_mean) & !is.nan(Lymphocyte_mean) & !is.nan(albumin_mean))


mycolors <- c('royalblue1', 'darkcyan')
new_df$color <- mycolors[ as.numeric(new_df$outcome) ]

par(mar=c(0,0,0,0))
plot3d( 
    x=new_df$Neutrophils_mean, y=new_df$Lymphocyte_mean, z=new_df$albumin_mean, 
    col = new_df$color, 
    type = 's', 
    radius = 1,
    legend=TRUE,
    xlab="Neutrophils(%)", ylab="Lymphocyte(%)", zlab="Albumin")
legend3d("topright", legend = c('Death', 'Survival'), pch = 10, col = mycolors, cex=0.8, inset=c(0.02))

writeWebGL( filename="3d_correlation_mean.html" ,  width=600, height=600)

Data after cleaning and transformation:

tbl_summary(
    new_df %>% ungroup() %>%rename(`Neutrophils(%)` = Neutrophils_mean, `Lymphocyte(%)` = Lymphocyte_mean, `Albumin`= albumin_mean) %>% select(-PATIENT_ID, -color),
    by = outcome
) %>%
    add_n() %>%
    modify_header(label = "**Mean value of**") %>%
    add_overall() %>%
    bold_labels() 
Mean value of Overall, N = 3541 N Death, N = 1621 Survival, N = 1921
Neutrophils(%) 77 (63, 91) 354 91 (87, 94) 66 (56, 72)
Lymphocyte(%) 15 (5, 26) 354 5 (3, 8) 25 (18, 31)
Albumin 33.1 (29.1, 37.0) 354 28.9 (26.3, 31.2) 36.5 (33.9, 39.4)

1 Statistics presented: Median (IQR)

htmltools::includeHTML("./3d_correlation_mean.html")
RGL model

You must enable Javascript to view this page properly.


Drag mouse to rotate model. Use mouse wheel or middle button to zoom it.

Object written from rgl 0.100.54 by writeWebGL.

Most common biomarkers

biomarker_popularity <- df %>% group_by(PATIENT_ID, outcome) %>% summarise_at(vars(`Hypersensitive cardiac troponinI`:creatinine), function(x) sum(!is.na(x)))
biomarker_popularity$sum_of_single_test <- rowSums(biomarker_popularity %>% ungroup() %>% select(-PATIENT_ID, -outcome))

most_tested_death <- biomarker_popularity %>% select(sum_of_single_test, outcome, `neutrophils(%)`,  `(%)lymphocyte`, albumin) %>% filter(outcome=='Death') %>% arrange(desc(sum_of_single_test)) %>% head(10)
most_tested_survival <- biomarker_popularity %>% select(sum_of_single_test, outcome, `neutrophils(%)`,  `(%)lymphocyte`, albumin) %>% filter(outcome=='Survival') %>% arrange(desc(sum_of_single_test)) %>% head(10)

foo <- rbind(most_tested_death, most_tested_survival)

patients_tests <- merge(foo %>% ungroup() %>% select(PATIENT_ID), df %>% ungroup() %>% select(PATIENT_ID, RE_DATE,`neutrophils(%)`,  `(%)lymphocyte`, albumin, outcome),  by="PATIENT_ID") %>% filter(!is.na(`neutrophils(%)`) |  !is.na(`(%)lymphocyte`) | !is.na(albumin))

neutrophils_seq <- patients_tests %>% filter(!is.na(`neutrophils(%)`)) 

neutrophils_seq %>% mutate(PATIENT_ID=as.factor(PATIENT_ID)) %>%
  ggplot( aes(x=RE_DATE, y=`neutrophils(%)`, group=PATIENT_ID, color=PATIENT_ID)) +
    geom_line() +
    geom_point() +
  facet_grid(rows=vars(outcome)) +
    ggtitle("Neutrophils (%) during patient hospitalization") +
    theme_ipsum() +
    ylab("Neutrophils (%)") +
    transition_reveal(RE_DATE)

#anim_save("neutrophils_seq.gif")

lymphocyte_seq <- patients_tests %>% filter(!is.na(`(%)lymphocyte`)) 

lymphocyte_seq %>% mutate(PATIENT_ID=as.factor(PATIENT_ID)) %>%
  ggplot( aes(x=RE_DATE, y=`(%)lymphocyte`, group=PATIENT_ID, color=PATIENT_ID)) +
    geom_line() +
    geom_point() +
  facet_grid(rows=vars(outcome)) +
    ggtitle("Lymphocyte (%) during patient hospitalization") +
    theme_ipsum() +
    ylab("Lymphocyte (%)") +
    transition_reveal(RE_DATE)

#anim_save("lymphocyte_seq.gif")

albumin_seq <- patients_tests %>% filter(!is.na(albumin)) 

albumin_seq %>% mutate(PATIENT_ID=as.factor(PATIENT_ID)) %>%
  ggplot( aes(x=RE_DATE, y=albumin, group=PATIENT_ID, color=PATIENT_ID)) +
    geom_line() +
    geom_point() +
  facet_grid(rows=vars(outcome)) +
    ggtitle("Albumin value during patient hospitalization") +
    theme_ipsum() +
    ylab("Albumin") +
    transition_reveal(RE_DATE)

#anim_save("albumin_seq.gif")

Prediction model

#Get last test of each patient for crucial biomarkers

#df %>% select(PATIENT_ID, RE_DATE, age, gender, `neutrophils(%)`, `(%)lymphocyte`, albumin, outcome) %>%
#  group_by(PATIENT_ID, age, gender, outcome) %>% 
#  fill(`neutrophils(%)`, `(%)lymphocyte`, albumin) %>% 
#  summarise_at(vars(`neutrophils(%)`:albumin), function(x) last(x,order_by = !is.na(x))) %>%
#  filter(!is.na(`neutrophils(%)`) &  !is.na(`(%)lymphocyte`) & !is.na(albumin))